library(tidyverse)
library(magrittr)
library(ngsReports)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(AnnotationHub)
library(ensembldb)
library(edgeR)
library(corrplot)
library(cqn)
library(DT)
library(Gviz)
library(AnnotationFilter)
library(Rsamtools)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
files1 <- list.files(
path = "0_rawData/FastQC",
pattern = "zip",
full.names = TRUE
)
files2 <- list.files(
path = "../20170327_Psen2S4Ter_RNASeq/data/0_rawData/FastQC",
pattern = "zip",
full.names = TRUE
) %>%
str_subset("R1_fastqc.zip")
filesFull <- c(files1, files2)
files <- filesFull %>%
basename()
samples <- tibble(
sample = str_remove(files, "_fastqc.zip"),
dataset = NA,
organism = NA
) %>%
mutate(
dataset = ifelse(
str_detect(sample, "SRR213"), "E-GEOD-71609", dataset
),
dataset = ifelse(
str_detect(sample, "SRR218"), "E-GEOD-72322", dataset
),
dataset = ifelse(
str_detect(sample, "Ps2Ex"), "Psen2S4Ter", dataset
),
sample = ifelse(
str_detect(sample, "Ps2Ex"), str_remove(sample, "_R1"), sample
),
organism = ifelse(
str_detect(sample, "(SRR213|SRR218|Ps2Ex)"), "zebrafish", organism
),
dataset = ifelse(
str_detect(sample, "ERR313"), "E-MTAB-7636", dataset
),
dataset = ifelse(
str_detect(sample, "ERR268"), "E-MTAB-6972", dataset
),
organism = ifelse(
str_detect(sample, "(ERR313|ERR268)"), "mouse", organism
)
)
datasets <- samples$dataset %>%
unique()
ah_Dr <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb_Dr <- ah_Dr[["AH74989"]]
trEns_Dr <- transcripts(ensDb_Dr) %>%
mcols() %>%
as_tibble()
trLen_Dr <- exonsBy(ensDb_Dr, "tx") %>%
width() %>%
vapply(sum, integer(1))
geneGcLen_Dr <- trLen_Dr %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns_Dr) %>%
group_by(gene_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
trGcLen_Dr <- trLen_Dr %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns_Dr) %>%
group_by(tx_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
genesGR_Dr <- genes(ensDb_Dr)
mcols(genesGR_Dr) <- mcols(genesGR_Dr)[c("gene_id", "gene_name",
"gene_biotype", "entrezid")]
txGR_Dr <- transcripts(ensDb_Dr)
mcols(txGR_Dr) <- mcols(txGR_Dr)[c("tx_id", "tx_name",
"tx_biotype", "tx_id_version", "gene_id")]
ah_Mm <- AnnotationHub() %>%
subset(species == "Mus musculus") %>%
subset(rdataclass == "EnsDb")
ensDb_Mm <- ah_Mm[["AH75036"]]
trEns_Mm <- transcripts(ensDb_Mm) %>%
mcols() %>%
as_tibble()
trLen_Mm <- exonsBy(ensDb_Mm, "tx") %>%
width() %>%
vapply(sum, integer(1))
geneGcLen_Mm <- trLen_Mm %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns_Mm) %>%
group_by(gene_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
trGcLen_Mm <- trLen_Mm %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns_Mm) %>%
group_by(tx_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
genesGR_Mm <- genes(ensDb_Mm)
mcols(genesGR_Mm) <- mcols(genesGR_Mm)[c("gene_id", "gene_name",
"gene_biotype", "entrezid")]
ah_Hs <- AnnotationHub() %>%
subset(species == "Homo sapiens") %>%
subset(rdataclass == "EnsDb")
ensDb_Hs <- ah_Hs[["AH75011"]]
trEns_Hs <- transcripts(ensDb_Hs) %>%
mcols() %>%
as_tibble()
trLen_Hs <- exonsBy(ensDb_Hs, "tx") %>%
width() %>%
vapply(sum, integer(1))
geneGcLen_Hs <- trLen_Hs %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns_Hs) %>%
group_by(gene_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
trGcLen_Hs <- trLen_Hs %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns_Hs) %>%
group_by(tx_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
genesGR_Hs <- genes(ensDb_Hs)
mcols(genesGR_Hs) <- mcols(genesGR_Hs)[c("gene_id", "gene_name",
"gene_biotype", "entrezid")]
An EnsDb object was obtained for Ensembl release 98 using the AnnotationHub package. This provided the GC content and length for every gene and transcript in the release.
Raw data was sourced from publically available datasets on ArrayExpress. Samples from zebrafish (Danio rerio) were chosen for an initial inspection as it is expected that rRNA sequences are more divergent from commonly chosen model organisms such as mice. Hence, inefficient ribosomal RNA (rRNA) depletion by kits that are generally optimised for these common model organisms may produce a stronger signal in zebrafish.
Following an initial inspection, further analysis on human and mouse datasets was performed.
The following analysis involves 92 samples across 5 datasets: E-MTAB-6972, E-MTAB-7636, E-GEOD-71609, E-GEOD-72322, Psen2S4Ter.
rawFqc <- filesFull %>%
FastqcDataList()
data1 <- grepl("SRR213", fqName(rawFqc))
rawLib1 <- plotReadTotals(rawFqc[data1]) +
labs(subtitle = "E-GEOD-71609")
data2 <- grepl("SRR218", fqName(rawFqc))
rawLib2 <- plotReadTotals(rawFqc[data2]) +
labs(subtitle = "E-GEOD-72322")
data3 <- grepl("Ps2Ex", fqName(rawFqc))
labels3 <- rawFqc[data3] %>%
fqName() %>%
str_remove("_6month_07_07_2016_F3") %>%
str_remove("\\.fastq\\.gz") %>%
str_remove("Ps2Ex3M1_")
rawLib3 <- plotReadTotals(rawFqc[data3]) +
labs(subtitle = "Psen2S4Ter") +
scale_x_discrete(labels = labels3)
data4 <- grepl("ERR313", fqName(rawFqc))
rawLib4 <- plotReadTotals(rawFqc[data4]) +
labs(subtitle = "E-MTAB-7636")
data5 <- grepl("ERR268", fqName(rawFqc))
rawLib5 <- plotReadTotals(rawFqc[data5]) +
labs(subtitle = "E-MTAB-6972")
The library sizes of unprocessed datasets ranged between 13,269,891 and 100,459,213 reads.
ggarrange(rawLib1, rawLib2, rawLib3, rawLib4, rawLib5, ncol = 1, nrow = 5)
rRNA transcripts are known to have high GC content. Therefore, surveying the GC content of the raw reads serves as a good starting point for detecting incomplete rRNA removal. A spike in GC content at > 70% is expected if this is the case.
plotly::ggplotly(
plotGcContent(
x = rawFqc[data1],
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
) +
labs(title = "Dataset 1, E-GEOD-71609 (D. rerio)") +
theme(legend.position="none")
)
GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.
plotly::ggplotly(
plotGcContent(
x = rawFqc[data2],
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
) +
labs(title = "Dataset 2, E-GEOD-72322 (D. rerio)") +
theme(legend.position="none")
)
GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.
plotly::ggplotly(
plotGcContent(
x = rawFqc[data3],
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
) +
labs(title = "Dataset 3, Psen2S4Ter (D. rerio)") +
theme(legend.position="none")
)
GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.
plotly::ggplotly(
plotGcContent(
x = rawFqc[data4],
plotType = "line",
gcType = "Transcriptome",
species = "Mmusculus"
) +
labs(title = "Dataset 4, E-MTAB-7636 (M. musculus)") +
theme(legend.position="none")
)
GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.
plotly::ggplotly(
plotGcContent(
x = rawFqc[data5],
plotType = "line",
gcType = "Transcriptome",
species = "Mmusculus"
) +
labs(title = "Dataset 5, E-MTAB-6972 (M. musuculus)") +
theme(legend.position="none")
)
GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.
The top 30 overrepresented sequences were analysed using blastn and were found to be predominantly rRNA sequences.
getModule(rawFqc, "Overrep") %>%
group_by(Sequence, Possible_Source) %>%
summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>%
arrange(desc(`Highest Percentage`), desc(`Found In`)) %>%
ungroup() %>%
dplyr::slice(1:30) %>%
mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
kable(
align = "llrr",
caption = paste(
"Top", nrow(.),"Overrepresented sequences.",
"The number of samples they were found in is shown,",
"along with the percentage of the most 'contaminated' sample."
)
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
| Sequence | Possible_Source | Found In | Highest Percentage |
|---|---|---|---|
| GTGGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGC | No Hit | 12 | 1.72% |
| CTCAGACAGGCGTAGCCCCGGGAGGAACCCGGGGCCGCAAGTGCGTTCGAA | No Hit | 13 | 1.43% |
| CCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATCC | No Hit | 24 | 1.32% |
| GGCCCGGCGCACGTCCAGAGTCGCCGCCGCACACCGCAGCGCATCCCCCC | No Hit | 9 | 1.31% |
| CTCGGCTCGTGCGTCGATGAAGAACGCAGCTAGCTGCGAGAATTAATGTGA | No Hit | 11 | 1.17% |
| CTCCTGAAAAGGTTGTATCCTTTGTTAAAGGGGCTGTACCCTCTTTAACT | No Hit | 11 | 1.11% |
| GGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATC | No Hit | 12 | 1.09% |
| GGGGTGTACGAAGCTGAACTTTTATTCATCTCCCAGACAACCAGCTATTG | No Hit | 12 | 1.07% |
| GGCCCGGCGCACGTCCAGAGTCGCCGCCGCGCACCGCAGCGCATCCCCCC | No Hit | 10 | 1.03% |
| CCAGGCTGGAGTGCAGTGGCTATTCACAGGCGCGATCCCACTACTGATCAG | No Hit | 25 | 0.85% |
| CCTCCTTCAAGTATTGTTTCATGTTACATTTTCGTATATTCTGGGGTAGA | No Hit | 12 | 0.82% |
| GGGAGTTTGACTGGGGCGGTACACCTGTCAAACGGTAACGCAGGTGTCCT | No Hit | 22 | 0.82% |
| CCCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATC | No Hit | 12 | 0.79% |
| GTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATCT | No Hit | 12 | 0.79% |
| GGGAGATACCATGATCACGAAGGTGGTTTTCCCAGGGCGAGGCTTATCCAT | No Hit | 8 | 0.78% |
| GGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCAT | No Hit | 12 | 0.78% |
| CGGGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG | No Hit | 12 | 0.73% |
| AGCAAATATTCAAACGAGAACTTTGAAGGCCGAAGTGGAGAAGGCTTCCA | No Hit | 10 | 0.73% |
| CACAAATTATGCAGTCGAGTTTCCCGCATTTGGGGAAATCGCAGGGGTCAG | No Hit | 7 | 0.69% |
| GGGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTT | No Hit | 11 | 0.68% |
| CAAATATTCAAACGAGAACTTTGAAGGCCGAAGTGGAGAAGGCTTCCATG | No Hit | 7 | 0.66% |
| CGACGCTCAGACAGGCGTAGCCCCGGGAGGAACCCGGGGCCGCAAGTGCGT | No Hit | 8 | 0.66% |
| CGCACGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGACAGGAGGATCGCTT | No Hit | 25 | 0.58% |
| CTGGAGTGCAGTGGCTATTCACAGGCGCGATCCCACTACTGATCAGCACGG | No Hit | 23 | 0.58% |
| CGGGTCGGGTGGGTAGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG | No Hit | 12 | 0.57% |
| CTTAGACGACCTGGTAGTCCAAGGCTCCCCCAGGAGCACCATATCGATAC | No Hit | 19 | 0.54% |
| GTTGGATTGTTCACCCACTAATAGGGAACGTGAGCTGGGTTTAGACCGTC | No Hit | 10 | 0.53% |
| CTCGTTTGGTTTCGGGGTTTCTAGCTGTAATTCTTTTAGTTAGAAGTTTTC | No Hit | 16 | 0.53% |
| AGCTGGGGAGATCCGCGAGAAGGGCCCGGCGCACGTCCAGAGTCGCCGCC | No Hit | 11 | 0.53% |
| GGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTTA | No Hit | 10 | 0.53% |
trimFqc1 <- list.files(
path = "1_trimmedData/FastQC",
pattern = "zip",
full.names = TRUE
)
trimFqc2 <- list.files(
path = "../20170327_Psen2S4Ter_RNASeq/data/1_trimmedData/FastQC",
pattern = "zip",
full.names = TRUE
)
trimFqc <- c(trimFqc1, trimFqc2) %>%
FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
dplyr::rename(Raw = Total_Sequences) %>%
left_join(readTotals(trimFqc), by = "Filename") %>%
dplyr::rename(Trimmed = Total_Sequences) %>%
mutate(
Discarded = 1 - Trimmed/Raw,
Retained = Trimmed / Raw
)
After trimming of adapters between 0.00% and 12.91% of reads were discarded.
Trimmed reads were firstly aligned to rRNA sequences using the BWA-MEM algorithm to calculate the proportion of reads that were of rRNA origin within each sample. BWA-MEM is recommended for high-quality queries of reads ranging from 70bp to 1Mbp as it is faster and more accurate that alternative algorithms BWA-backtrack and BWA-SW.
prop1 <- samples %>%
dplyr::filter(dataset == "E-GEOD-71609") %>%
# cbind(tibble(proportion = c(0.0344, 0.0339, 0.0409, 0.0347, 0.0362, 0.0397, 0.0288, 0.0346, 0.0429, 0.0297)))
cbind(tibble(proportion = c(0.0363, 0.0367, 0.0451, 0.0367, 0.0388, 0.0427, 0.0307, 0.0369, 0.0468, 0.0312)))
prop2 <- samples %>%
dplyr::filter(dataset == "E-GEOD-72322") %>%
# cbind(tibble(proportion = c(0.1465, 0.1048, 0.1304, 0.1498, 0.1136, 0.1274, 0.1206, 0.1318, 0.1351, 0.1385, 0.1456, 0.1353, 0.1634, 0.1480, 0.1492, 0.1361, 0.1683, 0.1709, 0.1639, 0.1654, 0.1940, 0.1782, 0.1837, 0.1666)))
cbind(tibble(proportion = c(0.1673, 0.1060, 0.1410, 0.1758, 0.1154, 0.1292, 0.1213, 0.1361, 0.1368, 0.1449, 0.1510, 0.1370, 0.1745, 0.1580, 0.1562, 0.1441, 0.1743, 0.1749, 0.1715, 0.1713, 0.2110, 0.1887, 0.1895, 0.1739)))
prop3 <- samples %>%
dplyr::filter(dataset == "Psen2S4Ter") %>%
cbind(tibble(proportion = c(0.2291, 0.2504, 0.2615, 0.2260, 0.2215, 0.2128, 0.2468, 0.2284, 0.1366, 0.1188, 0.1892, 0.1608))) %>%
mutate(
sample = str_remove(sample, "_6month_07_07_2016_F3"),
sample = str_remove(sample, "Ps2Ex3M1_")
)
prop4 <- samples %>%
dplyr::filter(dataset == "E-MTAB-7636") %>%
cbind(tibble(proportion = c(0.0614, 0.0517, 0.0458, 0.0633, 0.0547, 0.0564, 0.0525, 0.0520, 0.0551, 0.0513, 0.0473, 0.0448, 0.0407, 0.0238, 0.0499, 0.0307, 0.0383, 0.0178, 0.0232, 0.0384, 0.0461, 0.0545, 0.0389, 0.0353, 0.0866, 0.0763)))
prop5 <- samples %>%
dplyr::filter(dataset == "E-MTAB-6972") %>%
cbind(tibble(proportion = c(0.0123, 0.0122, 0.0121, 0.0121, 0.0121, 0.0131, 0.0131, 0.0131, 0.0130, 0.0131, 0.0929, 0.0924, 0.0931, 0.0929, 0.0927, 0.0224, 0.0223, 0.0222, 0.0223, 0.0222)))
rRnaProp <- rbind(prop1, prop2, prop3, prop4, prop5)
rRnaProp$dataset %<>%
factor(levels = c("E-GEOD-71609", "E-GEOD-72322", "Psen2S4Ter", "E-MTAB-7636", "E-MTAB-6972"))
rRnaProp %>%
ggplot(aes(sample, proportion)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~dataset, scales = "free_x") +
scale_y_continuous(labels = percent) +
labs(x = "Sample", y = "Percent of Total") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Percentages of each library that align to rRNA sequences with bwa mem.
Following alignment to rRNA sequences, reads that mapped to rRNA sequences were removed and a separate fastq file was created.
These sequences were aligned to the Danio rerio GRCz11 genome (Ensembl release 98) using STAR 2.7.0d and summarised with featureCounts.
dgeList_1 <- read_tsv("3_alignedDataStar/featureCounts/genes_Dr.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
dplyr::select(str_subset(colnames(.), "SRR213")) %>%
DGEList() %>%
calcNormFactors()
dgeList_2 <- read_tsv("3_alignedDataStar/featureCounts/genes_Dr.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
dplyr::select(str_subset(colnames(.), "SRR218")) %>%
DGEList() %>%
calcNormFactors()
dgeList_3 <- read_tsv("../20170327_Psen2S4Ter_RNASeq/data/2_alignedData/featureCounts/genes.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
set_colnames(str_remove(colnames(.), "_6month_07_07_2016_F3")) %>%
set_colnames(str_remove(colnames(.), "Ps2Ex3M1_")) %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
dgeList_4 <- read_tsv("3_alignedDataStar/featureCounts/genes_Mm.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
dplyr::select(str_subset(colnames(.), "ERR313")) %>%
DGEList() %>%
calcNormFactors()
dgeList_5 <- read_tsv("3_alignedDataStar/featureCounts/genes_Mm.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
dplyr::select(str_subset(colnames(.), "ERR268")) %>%
DGEList() %>%
calcNormFactors()
dgeList_1$genes <- genesGR_Dr[rownames(dgeList_1),]
mcols(dgeList_1$genes) %<>%
as.data.frame() %>%
left_join(geneGcLen_Dr)
dgeList_1$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(rRnaProp) %>%
column_to_rownames("rowname")
dgeList_1$samples$filenames <- read_tsv(
"3_alignedDataStar/featureCounts/genes_Dr.out"
) %>%
dplyr::select(str_subset(colnames(.), "SRR213")) %>%
colnames()
dgeList_2$genes <- genesGR_Dr[rownames(dgeList_2),]
mcols(dgeList_2$genes) %<>%
as.data.frame() %>%
left_join(geneGcLen_Dr)
dgeList_2$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(rRnaProp) %>%
column_to_rownames("rowname")
dgeList_2$samples$filenames <- read_tsv(
"3_alignedDataStar/featureCounts/genes_Dr.out"
) %>%
dplyr::select(str_subset(colnames(.), "SRR218")) %>%
colnames()
dgeList_3$genes <- genesGR_Dr[rownames(dgeList_3),]
mcols(dgeList_3$genes) %<>%
as.data.frame() %>%
left_join(geneGcLen_Dr)
dgeList_3$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(rRnaProp) %>%
column_to_rownames("rowname")
## file paths need to be changed to current filepath
## the original filepath (that featureCounts used) is different to the current
dgeList_3$samples$filenames <- read_tsv(
"../20170327_Psen2S4Ter_RNASeq/data/2_alignedData/featureCounts/genes.out"
) %>%
dplyr::select(str_subset(colnames(.), "Ps2Ex3M1_")) %>%
colnames() %>%
str_replace(., "/2_al", "/data/2_al")
dgeList_4$genes <- genesGR_Mm[rownames(dgeList_4),]
mcols(dgeList_4$genes) %<>%
as.data.frame() %>%
left_join(geneGcLen_Mm)
dgeList_4$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(rRnaProp) %>%
column_to_rownames("rowname")
dgeList_4$samples$filenames <- read_tsv(
"3_alignedDataStar/featureCounts/genes_Mm.out"
) %>%
dplyr::select(str_subset(colnames(.), "ERR313")) %>%
colnames()
dgeList_5$genes <- genesGR_Mm[rownames(dgeList_5),]
mcols(dgeList_5$genes) %<>%
as.data.frame() %>%
left_join(geneGcLen_Mm)
dgeList_5$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(rRnaProp) %>%
column_to_rownames("rowname")
dgeList_5$samples$filenames <- read_tsv(
"3_alignedDataStar/featureCounts/genes_Mm.out"
) %>%
dplyr::select(str_subset(colnames(.), "ERR268")) %>%
colnames()
gcInfo_Dr <- function(x) {
x$counts %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
as_tibble() %>%
pivot_longer(
cols = colnames(x),
names_to = "sample",
values_to = "counts"
) %>%
dplyr::filter(
counts > 0
) %>%
left_join(
geneGcLen_Dr
) %>%
dplyr::select(
ends_with("id"), sample, counts, aveGc, maxLen
) %>%
split(f = .$sample) %>%
lapply(
function(x){
DataFrame(
gc = Rle(x$aveGc, x$counts),
logLen = Rle(log10(x$maxLen), x$counts)
)
}
)
}
gcInfo_Mm <- function(x) {
x$counts %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
as_tibble() %>%
pivot_longer(
cols = colnames(x),
names_to = "sample",
values_to = "counts"
) %>%
dplyr::filter(
counts > 0
) %>%
left_join(
geneGcLen_Mm
) %>%
dplyr::select(
ends_with("id"), sample, counts, aveGc, maxLen
) %>%
split(f = .$sample) %>%
lapply(
function(x){
DataFrame(
gc = Rle(x$aveGc, x$counts),
logLen = Rle(log10(x$maxLen), x$counts)
)
}
)
}
gcSummary <- function(x) {
x %>%
vapply(function(x){
c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
}, numeric(4)
) %>%
t() %>%
set_colnames(
c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
) %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble()
}
rle_1 <- gcInfo_Dr(dgeList_1)
rle_2 <- gcInfo_Dr(dgeList_2)
rle_3 <- gcInfo_Dr(dgeList_3)
rle_4 <- gcInfo_Mm(dgeList_4)
rle_5 <- gcInfo_Mm(dgeList_5)
sum_1 <- gcSummary(rle_1)
sum_2 <- gcSummary(rle_2)
sum_3 <- gcSummary(rle_3)
sum_4 <- gcSummary(rle_4)
sum_5 <- gcSummary(rle_5)
a_1 <- sum_1 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_logLen)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean log(Length)"
)
b_1 <- sum_1 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_gc)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean GC Content"
)
ggarrange(a_1, b_1, ncol = 2, nrow = 1) %>%
annotate_figure("Dataset 1, E-GEOD-71609 (D. rerio)")
Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.
a_2 <- sum_2 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_logLen)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean log(Length)"
)
b_2 <- sum_2 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_gc)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean GC Content"
)
ggarrange(a_2, b_2, ncol = 2, nrow = 1) %>%
annotate_figure("Dataset 2, E-GEOD-72322 (D. rerio)")
Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.
a_3 <- sum_3 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_logLen)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean log(Length)"
)
b_3 <- sum_3 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_gc)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean GC Content"
)
ggarrange(a_3, b_3, ncol = 2, nrow = 1) %>%
annotate_figure("Dataset 3, PsenS4Ter (D. rerio)")
Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.
a_4 <- sum_4 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_logLen)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean log(Length)"
)
b_4 <- sum_4 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_gc)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean GC Content"
)
ggarrange(a_4, b_4, ncol = 2, nrow = 1) %>%
annotate_figure("Dataset 4, E-MTAB-7636 (M. musculus)")
Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.
a_5 <- sum_5 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_logLen)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean log(Length)"
)
b_5 <- sum_5 %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_gc)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean GC Content"
)
ggarrange(a_5, b_5, ncol = 2, nrow = 1) %>%
annotate_figure("Dataset 5, E-MTAB-6972 (M. musculus)")
Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.
genes2keep_3 <- dgeList_3 %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(6)
dgeFilt_3 <- dgeList_3[genes2keep_3,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
pca_3 <- cpm(dgeFilt_3, log = TRUE) %>%
t() %>%
prcomp()
pcaCor_3 <- pca_3$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sum_3) %>%
as_tibble() %>%
left_join(rRnaProp) %>%
dplyr::select(
PC1, PC2, PC3,
Mean_GC = mn_gc,
Mean_Length = mn_logLen,
rRna_Proportion = proportion
) %>%
cor()
a_3 <- pca_3$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
ggplot(aes(PC1, PC2)) +
geom_point(size = 2) +
labs(
x = paste0("PC1 (", percent(summary(pca_3)$importance["Proportion of Variance","PC1"]),")"),
y = paste0("PC2 (", percent(summary(pca_3)$importance["Proportion of Variance","PC2"]),")")
)
b_3 <- pca_3$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(rRnaProp) %>%
ggplot(aes(PC1, proportion)) +
geom_point(size = 2) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
labs(
x = paste0("PC1 (", percent(summary(pca_3)$importance["Proportion of Variance","PC1"]),")"),
y = "rRNA Proportion of Initial Library"
)
c_3 <- pca_3$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sum_3) %>%
as_tibble() %>%
ggplot(aes(PC1, mn_logLen)) +
geom_point(size = 2) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(pca_3)$importance["Proportion of Variance","PC1"]),")"),
y = "Mean log(Length)"
)
d_3 <- pca_3$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sum_3) %>%
as_tibble() %>%
ggplot(aes(PC1, mn_gc)) +
geom_point(size = 2) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
labs(
x = paste0("PC1 (", percent(summary(pca_3)$importance["Proportion of Variance","PC1"]),")"),
y = "Mean GC"
)
ggarrange(a_3, b_3, c_3, d_3, ncol = 2, nrow = 2) %>%
annotate_figure("Psen2S4Ter")
PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.
corrplot(
pcaCor_3,
type = "lower",
diag = FALSE,
addCoef.col = 1, addCoefasPercent = TRUE
)
Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.
design_3 <- model.matrix(~proportion, data = dgeFilt_3$samples)
voom_3 <- voom(dgeFilt_3, design = design_3)
fit_3 <- lmFit(voom_3, design = design_3)
eBayes_3 <- eBayes(fit_3)
topTable_3 <- eBayes_3 %>%
topTable(coef = colnames(design_3)[2], sort.by = "p", n = Inf) %>%
set_colnames(str_remove(colnames(.), "ID\\.")) %>%
mutate(DE = adj.P.Val < 0.05) %>%
unite(Location, c(seqnames, start, end, width, strand), sep = ":") %>%
dplyr::select(
Geneid = gene_id,
Symbol = gene_name,
AveExpr,
logFC,
P.Value,
FDR = adj.P.Val,
Location,
t,
DE,
everything(),
-B
) %>%
as_tibble()
topTable_3 %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR, DE) %>%
mutate(
AveExpr = format(round(AveExpr, 2), nsmall = 2),
logFC = format(round(logFC, 2), nsmall = 2),
P.Value = sprintf("%.2e", P.Value),
FDR = sprintf("%.2e", FDR)
) %>%
dplyr::slice(1:200) %>%
datatable(options = list(pageLength = 20), class = "striped hover condensed responsive", filter = "top")
dirs_3 <- list.dirs(
"~/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant/",
recursive = FALSE
)
salmon_3 <- catchSalmon(paths = dirs_3)
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_80_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_81_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_84_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_89_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_78_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_82_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_83_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_85_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_79_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_86_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_88_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_95_Fem, 51605 transcripts, 0 bootstraps
dgeListTx_3 <- salmon_3$counts %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "_6month_07_07_2016_F3")) %>%
set_colnames(str_remove(colnames(.), "Ps2Ex3M1_")) %>%
as.data.frame() %>%
rownames_to_column("tx_id") %>%
mutate(tx_id = str_remove(tx_id, "\\.[0-9]+$")) %>%
column_to_rownames("tx_id") %>%
DGEList() %>%
calcNormFactors()
genes2keep_3 <- dgeListTx_3 %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(6)
dgeFiltTx_3 <- dgeListTx_3[genes2keep_3,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
# deIds <- topTable_3 %>%
# dplyr::filter(DE) %>%
# .$Geneid
# notIds <- topTable_3 %>%
# dplyr::filter(!DE) %>%
# .$Geneid
# dna <- ensembldb::getGenomeTwoBitFile(ensDb_Dr)
# deExons <- exonsBy(ensDb_Dr, by = "gene") %>%
# .[deIds] %>%
# lapply(function(x){
# GenomicRanges::reduce(x)
# }) %>%
# GRangesList()
# deStrings <- lapply(deExons, function(x){
# getSeq(dna, x) %>%
# unlist()
# }
# )
# deStringSet <- DNAStringSet(deStrings)
# writeXStringSet(
# deStringSet,
# "/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/deSeqs.fa"
# )
# conExons <- exonsBy(ensDb_Dr, by = "gene") %>%
# .[conIds] %>%
# lapply(function(x){
# GenomicRanges::reduce(x)
# }) %>%
# GRangesList()
# conStrings <- lapply(conExons, function(x){
# getSeq(dna, x) %>%
# unlist()
# }
# )
# conStringSet <- DNAStringSet(conStrings)
# writeXStringSet(
# conStringSet,
# "/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/conSeqs.fa"
# )
loadMer <- function(x){
read.table(
paste0(
"/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/",
x,
"_dumps.txt"
),
col.names = c("mer", "count")
) %>%
dplyr::arrange(-count) %>%
as_tibble()
}
merDek5 <- loadMer("dek5")
merDek6 <- loadMer("dek6")
merDek7 <- loadMer("dek7")
merDek8 <- loadMer("dek8")
merDek9 <- loadMer("dek9")
merDek10 <- loadMer("dek10")
merConk5 <- loadMer("conk5")
merConk6 <- loadMer("conk6")
merConk7 <- loadMer("conk7")
merConk8 <- loadMer("conk8")
merConk9 <- loadMer("conk9")
merConk10 <- loadMer("conk10")
x <- merDek10
y <- merConk10
merRatio <- function(x, y) {
join <- x %>%
full_join(y, by = "mer")
join[is.na(join)] <- 0
join %>%
mutate(
p.x = count.x / sum(count.x),
p.y = count.y / sum(count.y),
ratio = p.x / p.y
) %>%
arrange(desc(ratio))
}
merRatk5 <- merRatio(merDek5, merConk5)
merRatk6 <- merRatio(merDek6, merConk6)
merRatk7 <- merRatio(merDek7, merConk7)
merRatk8 <- merRatio(merDek8, merConk8)
merRatk9 <- merRatio(merDek9, merConk9)
merRatk10 <- merRatio(merDek10, merConk10)
plotRatk5 <- merRatk5 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 5") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk6 <- merRatk6 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 6") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk7 <- merRatk7 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 7") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk8 <- merRatk8 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 8") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk9 <- merRatk9 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 9") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk10 <- merRatk10 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 10") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRat <- ggarrange(plotRatk5, plotRatk6, plotRatk7, plotRatk8,
plotRatk9, plotRatk10, nrow = 3, ncol = 2)
annotate_figure(
plotRat,
bottom = "Square root ratio of DE gene kmer counts vs not DE gene kmer counts"
)
nBins <- list(length = 10, gc = 10)
binTable_3 <- topTable_3 %>%
dplyr::select(-gene_biotype, entrezid) %>%
mutate(
lengthBins = cut(
log(aveLen),
breaks = quantile(
log(aveLen), seq(0, nBins$length)/nBins$length
),
labels = paste0("L", seq_len(nBins$length)),
include.lowest = TRUE
),
gcBins = cut(
aveGc,
breaks = quantile(
aveGc, seq(0, nBins$gc) / nBins$gc
),
labels = paste0("GC", seq_len(nBins$gc)),
include.lowest = TRUE
),
bothBins = paste(lengthBins, gcBins, sep = "_"),
bothBins = as.factor(bothBins)
) %>%
group_by(bothBins) %>%
mutate(n = n()) %>%
ungroup() %>%
dplyr::filter(n > 1) %>%
as_tibble()